* --------------------------------------------------------------------------------------
* This script cleans raw BLS time-series files (LN, LA, CU) and builds the monthly analysis datasets
*
* BLS data - https://download.bls.gov/pub/time.series/overview.txt
* - LN: Labor Force Statistics (National) - https://download.bls.gov/pub/time.series/ln
* - LA: Local Area Unemployment (State and County) - https://download.bls.gov/pub/time.series/la
* - CU: CPI (U.S. city average, regional) - https://download.bls.gov/pub/time.series/cu

* --------------------------------------------------------------------------------------

* Settings
* Run under Stata 16 command interpretation, then execute the project-wide settings script.
* Every path in this file is built from the SSDIMed global macro, including the path of the
* settings script itself, so that global must already be defined when this file is run.
version 16
do "$SSDIMed/scripts/_auxiliary/_project_settings.do"


* --------------------------------------------------------------------------------------
* Data cleaning

* LN: BLS Labor Force Statistics from the Current Population Survey (NAICS)
if 1 {
  * Mapping (lookup) files for the LN survey
  *   series: BLS Labor Force Statistics series names and information (key: series_id)
  *   lfst:   BLS Labor Force Statistics LFST information              (key: lfst_code)
  *   ages:   BLS Labor Force Statistics ages information              (key: ages_code)
  * Each file is imported, string-trimmed, checked for a complete and unique key,
  * and saved as ln.<name>.dta under data/proc/bls/labor_force/.
  local key_series series_id
  local key_lfst   lfst_code
  local key_ages   ages_code
  foreach tbl in series lfst ages {
    local keyvar `key_`tbl''
    import delimited "$SSDIMed/data/raw/bls/labor_force/ln.`tbl'", varnames(1) clear

    * Strip leading/trailing blanks from every string column
    ds, has(type string)
    local textcols `r(varlist)'
    foreach col of local textcols {
      replace `col' = strtrim(`col')
    }

    * QC: the key is never missing and uniquely identifies rows
    assert !missing(`keyvar')
    gisid `keyvar'

    * Save (make sure the output folder exists first)
    !mkdir -p "$SSDIMed/data/proc/bls/labor_force/"
    compress
    save "$SSDIMed/data/proc/bls/labor_force/ln.`tbl'.dta", replace
  }
  
  * BLS Labor Force Statistics data (ln.data.1.AllData), enriched with the LN lookup tables
  if 1 {
    import delimited "$SSDIMed/data/raw/bls/labor_force/ln.data.1.AllData", clear

    * Strip leading/trailing blanks from every string column
    ds, has(type string)
    local textcols `r(varlist)'
    foreach col of local textcols {
      replace `col' = strtrim(`col')
    }

    * QC: series_id always present; one row per series x year x period
    assert !missing(series_id)
    gisid series_id year period

    * Series names and information: assert(match) requires a full two-way match
    merge m:1 series_id using "$SSDIMed/data/proc/bls/labor_force/ln.series.dta", ///
      keepusing(lfst_code ages_code periodicity_code series_title seasonal) ///
      assert(match) nogen noreport

    * LFST information: lookup rows without data are allowed and dropped
    merge m:1 lfst_code using "$SSDIMed/data/proc/bls/labor_force/ln.lfst.dta", ///
      assert(match using) keep(match) nogen noreport

    * Ages information: lookup rows without data are allowed and dropped
    merge m:1 ages_code using "$SSDIMed/data/proc/bls/labor_force/ln.ages.dta", ///
      assert(match using) keep(match) nogen noreport

    * Print one series to the log as a visual sanity check
    local demo_if if series_id == "LNS14000000"
    tab series_title `demo_if'
    list year period value `demo_if'
    tab year period `demo_if'

    * Label select variables
    label var year "Year of observation"
    label var period "Period of observation"

    * Save: re-check the key (this also leaves the data sorted by series_id year period)
    bysort series_id year period: assert _N == 1
    compress
    save "$SSDIMed/data/proc/bls/labor_force/ln.data.dta", replace
  }
  
  * For analysis: national, monthly labor force stats
  if 1 {
    * Labor Force Statistics from the Current Population Survey
    * ref: https://download.bls.gov/pub/time.series/ln/ln.lfst
    *
    * Output: one row per month (through 2019) with the two focal series as columns,
    * plus a calendar-year mean and a trailing 12-month moving average of each.

    use "$SSDIMed/data/proc/bls/labor_force/ln.data.dta", clear

    * Focal series: BLS id, expected title (used for QC), output variable name, variable label
    local series_1_id LNS14000000
    local series_1_title (Seas) Unemployment Rate
    local series_1_var unemp_rate_us
    local series_1_label Unemployment Rate (US,  monthly values, seasonally adj.)

    local series_2_id LNS12300000
    local series_2_title (Seas) Employment-Population Ratio
    local series_2_var emppop_us
    local series_2_label Employment-Population Ratio (US,  monthly values, seasonally adj.)

    local periodicity M

    * QC: the ids still point at the series we think they do
    assert series_title == "`series_1_title'" if series_id == "`series_1_id'"
    assert series_title == "`series_2_title'" if series_id == "`series_2_id'"

    * Keep focal series over selected years
    keep if inlist(series_id, "`series_1_id'", "`series_2_id'")
    keep if year <= 2019

    * QC: coverage starts in 1948, monthly periodicity, ages 16+, 12 rows per series-year
    qui sum year
    assert r(min) == 1948
    assert periodicity_code == "`periodicity'"
    assert ages_text == "16 years and over"
    * bysort (not by) so this check does not depend on the sort order stored in the input file;
    * period is a secondary sort key so the resulting row order is deterministic
    bysort series_id year (period): assert _N == 12

    * Date: month of year, parsed from the digits after the leading letter of period
    gen m = real(substr(period, 2, 3))
    assert inrange(m, 1, 12)
    gen yearmonth = ym(year, m)
    format yearmonth %tm
    label var yearmonth "year-month of observation"
    assert month(dofm(yearmonth)) == m
    drop m
    order series_id year period yearmonth

    * Destring values
    destring value, replace

    * Reshape wide: one column per series_id; variables not listed in i() are dropped
    drop series_title
    greshape spread value, key(series_id) i(yearmonth year period) xi(drop)

    * Rename the reshaped columns and label them (labels describe monthly values)
    assert "`periodicity'" == "M"
    rename `series_1_id' `series_1_var'
    rename `series_2_id' `series_2_var'
    label var `series_1_var' "`series_1_label'"
    label var `series_2_var' "`series_2_label'"

    * Calendar year average: average value over 12 months of calendar year containing current month
    foreach var in `series_1_var' `series_2_var' {
      gegen `var'_yyyy = mean(`var'), by(year)
      label var `var'_yyyy "(calendar yyyy mean) `var'"
    }

    * Moving average: average value over 12 months ending with current month
    * syntax: window(a b) -> x[i - a] to x[i + b]
    * The bysort below leaves rows ordered by yearmonth before the window is applied
    * (presumably the window is positional, so this order matters - confirm in the gtools docs)
    local m 11
    local p 0
    bys yearmonth: assert _N == 1
    foreach var in `series_1_var' `series_2_var' {
      gegen `var'_mov_`m'_`p' = moving_mean(`var'), window(-`m' `p') labelformat((#stat#) `var')
    }

    * Save
    order yearmonth year period `series_1_var'* `series_2_var'*
    compress
    bys yearmonth: assert _N == 1
    save "$SSDIMed/data/proc/bls/labor_force/bls_labor_force_us_monthly.dta", replace
  }
}

* LA: BLS Local Area Unemployment Statistics
if 1 {
  * Mapping (lookup) files for the LA survey
  *   series:   BLS LAUS series names and information                            (key: series_id)
  *   area:     BLS LAUS area code                                               (key: area_code)
  *   period:   BLS LAUS period code                                             (key: period)
  *   measure:  BLS LAUS measure codes (e.g., "unemployment rate", "labor force") (key: measure_code)
  * Each file is imported, string-trimmed, checked for a complete and unique key,
  * and saved as la.<name>.dta under data/proc/bls/laus/.
  local key_series  series_id
  local key_area    area_code
  local key_period  period
  local key_measure measure_code
  foreach tbl in series area period measure {
    local keyvar `key_`tbl''
    import delimited "$SSDIMed/data/raw/bls/laus/la.`tbl'", varnames(1) clear

    * Strip leading/trailing blanks from every string column
    ds, has(type string)
    local textcols `r(varlist)'
    foreach col of local textcols {
      replace `col' = strtrim(`col')
    }

    * QC: the key is never missing and uniquely identifies rows
    assert !missing(`keyvar')
    gisid `keyvar'

    * Save (make sure the output folder exists first)
    !mkdir -p "$SSDIMed/data/proc/bls/laus/"
    compress
    save "$SSDIMed/data/proc/bls/laus/la.`tbl'.dta", replace
  }
  
  * BLS LAUS Data: State and County
  foreach geog in state county {
    * Per-geography settings: raw file name plus the values the QC below expects
    *   state:  area type A, seasonal flag S, 52 distinct area codes
    *   county: area type F, seasonal flag U, 3222 distinct area codes
    if "`geog'" == "state" {
      local rawfile    la.data.3.AllStatesS
      local exp_type   A
      local exp_seas   S
      local exp_nareas 52
    }
    else if "`geog'" == "county" {
      local rawfile    la.data.64.County
      local exp_type   F
      local exp_seas   U
      local exp_nareas 3222
    }
    import delimited "$SSDIMed/data/raw/bls/laus/`rawfile'", clear

    * Strip leading/trailing blanks from every string column
    ds, has(type string)
    local textcols `r(varlist)'
    foreach col of local textcols {
      replace `col' = strtrim(`col')
    }

    * QC: series_id always present; one row per series x year x period
    assert !missing(series_id)
    bysort series_id year period: assert _N == 1

    * Lookups: in each merge, lookup rows without data are allowed and dropped,
    * but every data row must find its lookup row
    * Series names and information
    merge m:1 series_id using "$SSDIMed/data/proc/bls/laus/la.series.dta", ///
      keepusing(area_type_code area_code measure_code seasonal srd_code series_title) ///
      assert(match using) keep(match) nogen noreport

    * Area names
    merge m:1 area_code using "$SSDIMed/data/proc/bls/laus/la.area.dta", ///
      keepusing(area_text) assert(match using) keep(match) nogen noreport

    * Period names
    merge m:1 period using "$SSDIMed/data/proc/bls/laus/la.period.dta", ///
      assert(match using) keep(match) nogen noreport

    * Measure names
    merge m:1 measure_code using "$SSDIMed/data/proc/bls/laus/la.measure.dta", ///
      keepusing(measure_text) assert(match using) keep(match) nogen noreport
    tab measure_text

    * QC: the file holds only the expected area type, seasonal flag and number of areas
    assert area_type_code == "`exp_type'"
    assert seasonal == "`exp_seas'"
    qui glevelsof area_code
    assert r(J) == `exp_nareas'

    * Label select variables
    label var year "Year of observation"
    label var period "Period of observation"
    label var area_code "Area of observation"
    label var area_text "Area name (of area_code)"
    label var area_type_code "Area type (of area_code): F=county, A=state"
    label var period_abbr "Period name abbr (of period)"
    label var period_name "Period name (of period)"

    * Save
    bysort area_code series_title year period: assert _N == 1
    compress
    save "$SSDIMed/data/proc/bls/laus/la.data.`geog'.dta", replace
  }
  
  * For analysis: State, County measures (monthly)
  foreach geog in state county {
    * Local Area Unemployment Statistics (LA)
    * ref: https://download.bls.gov/pub/time.series/la.txt
    *
    * Output: one row per area x month (through 2019) with one column per measure,
    * plus a per-area calendar-year mean and a per-area trailing 12-month moving
    * average of each measure.

    * measure_text value (label) -> output variable name (suffix is the geography)
    local series_1_label employment
    local series_1_var emp_`geog'
    local series_2_label employment-population ratio
    local series_2_var emppop_`geog'
    local series_3_label labor force
    local series_3_var lf_`geog'
    local series_4_label labor force participation rate
    local series_4_var lfpr_`geog'
    local series_5_label unemployment
    local series_5_var unemp_`geog'
    local series_6_label unemployment rate
    local series_6_var unemp_rate_`geog'

    use "$SSDIMed/data/proc/bls/laus/la.data.`geog'.dta", clear
    tab measure_text seasonal

    * QC
    gisid series_id year period

    * State data are seasonally adjusted; county data are not seasonally adjusted
    if "`geog'" == "state" {
      assert area_type_code == "A"
      assert seasonal == "S"
    }
    else if "`geog'" == "county" {
      assert area_type_code == "F"
      assert seasonal == "U"
    }
    gisid area_text measure_text year period

    * Data are monthly
    assert substr(period, 1, 1) == "M"

    * Inspect: print one area-measure-year to the log
    sort series_id year period
    local sample_if if area_text == "District of Columbia" & measure_text == "unemployment rate" & year == 2012
    list series_id year period value area_type_code seasonal `sample_if', sep(12)

    * Drop the "Annual Average" period (relevant for county only)
    assert (period_name == "Annual Average") == (period == "M13")
    assert area_type_code == "F" if period_name == "Annual Average"
    drop if period_name == "Annual Average"

    * Selected years
    keep if year <= 2019

    * QC: only months 1-12 remain, and each series has all 12 months in every year
    assert inrange(real(substr(period, 2, 3)), 1, 12)
    bys series_id year: assert (_N == 12)

    * Date: month of year, parsed from the digits after the leading letter of period
    gen m = real(substr(period, 2, 3))
    assert inrange(m, 1, 12)
    gen yearmonth = ym(year, m)
    format yearmonth %tm
    label var yearmonth "year-month of observation"
    assert month(dofm(yearmonth)) == m
    drop m
    order series_id year period yearmonth

    * No need to keep series_title: it is fully determined by measure, area and seasonal flag
    assert lower(series_title) == lower(measure_text) + ": " + lower(area_text) + " (" + lower(seasonal) + ")"
    drop series_title

    * Destring values; a lone "-" is turned into missing first
    assert !missing(value)
    replace value = "" if value == "-"
    destring value, replace

    * Clean values measure_text takes on--will be the new varnames created by reshape
    forvalues s = 1/6 {
      replace measure_text = "`series_`s'_var'" if measure_text == "`series_`s'_label'"
    }
    glevelsof measure_text, clean local(keyvars)

    * Reshape wide: one column per measure. The descriptive variables in ivarlist are
    * checked to be constant within area x month so they can ride along in i().
    local keyvar measure_text
    local ivars area_code yearmonth
    gisid `keyvar' `ivars'
    local ivarlist year period area_* period_*
    foreach var of varlist `ivarlist' {
      bys `ivars': assert `var' == `var'[1]
    }
    greshape spread value, key(`keyvar') i(`ivars' `ivarlist') xi(drop)

    * Label new variables (capture: not every measure need be present for every geography)
    if "`geog'" == "state"  local sa_txt seasonally adj.
    if "`geog'" == "county" local sa_txt not seasonally adj.
    forvalues s = 1/6 {
      cap label var `series_`s'_var' "`series_`s'_label' (`geog', monthly values, `sa_txt')"
    }

    * Calendar year average: average value over 12 months of calendar year containing current month
    * Grouped by area AND year: the data are a panel of areas, so grouping by year alone
    * would average across all areas instead of within each area.
    foreach var of local keyvars {
      gegen `var'_yyyy = mean(`var'), by(area_code year)
      label var `var'_yyyy "(calendar yyyy mean) `var'"
    }

    * Moving average: average value over 12 months ending with current month
    * syntax: window(a b) -> x[i - a] to x[i + b]
    local m 11
    local p 0
    bys area_code yearmonth: assert _N == 1
    foreach var of local keyvars {
      gegen `var'_mov_`m'_`p' = moving_mean(`var'), by(area_code) window(-`m' `p') labelformat((#stat#) `var')
    }

    * Save
    foreach var of local keyvars {
      order `var'*
    }
    order area_code yearmonth area_t* year period*
    compress
    bys area_code yearmonth: assert _N == 1

    * County only: 5 characters of area_code starting at position 3
    * (presumably the 5-digit county FIPS code - confirm against the LAUS area_code layout)
    if "`geog'" == "county" {
      gen fipscounty = substr(area_code, 3, 5)
      label var fipscounty "Characters 3-7 of area_code (county FIPS)"
    }
    save "$SSDIMed/data/proc/bls/laus/bls_laus_`geog'_monthly.dta", replace
  }
  
}

* CU: BLS CPI (U.S. city average, regional)
if 1 {
  * Mapping (lookup) files for the CU survey
  *   series: BLS CPI series names and information (key: series_id)
  *   area:   BLS CPI area info                    (key: area_code)
  *   period: BLS CPI period info                  (key: period)
  * Each file is imported, string-trimmed, checked for a complete and unique key,
  * and saved as cu.<name>.dta under data/proc/bls/cpi/.
  *
  * Documentation: https://download.bls.gov/pub/time.series/cu/cu.txt
  *
  * File Structure and Format: The following represents the file format used to define cu.series.
  *
  *   Field #/Data Element      Length    Value(Example)
  *   1.  series_id             17        CUSR0000SA0
  *   2.  area_code              4        0400
  *   3.  item_code              8        SA0E
  *   4.  seasonal               1        S or U
  *   5.  periodicity_code       1        R
  *   6.  base_code              1        S
  *   7.  base_period           20        1982-84=100
  *   8.  begin_year             4        1947
  *   9.  begin_period           3        M01
  *   10. end_year               4        2002
  *   11. end_period             3        M02
  local key_series series_id
  local key_area   area_code
  local key_period period
  foreach tbl in series area period {
    local keyvar `key_`tbl''
    import delimited "$SSDIMed/data/raw/bls/cpi/cu.`tbl'", varnames(1) clear

    * Strip leading/trailing blanks from every string column
    ds, has(type string)
    local textcols `r(varlist)'
    foreach col of local textcols {
      replace `col' = strtrim(`col')
    }

    * QC: the key is never missing and uniquely identifies rows
    assert !missing(`keyvar')
    gisid `keyvar'

    * Save (make sure the output folder exists first)
    !mkdir -p "$SSDIMed/data/proc/bls/cpi/"
    compress
    save "$SSDIMed/data/proc/bls/cpi/cu.`tbl'.dta", replace
  }
  
  * BLS CPI Series Data Summaries
  if 1 {
    * Documentation: https://download.bls.gov/pub/time.series/cu/cu.txt
    *
    * File Structure and Format: The following represents the file format used to define each data file.
    *
    *   Field #/Data Element  Length    Value(Example)
    *   -------------------------------------------------------
    *   1. series_id          17        CUUR0400AA0
    *   2. year                4        1966
    *   3. period              3        M12
    *   4. value              12        53.3
    *   5. footnote_codes     10        It varies
    *
    * The series_id (CUUR0400AA0) can be broken out into:
    *   survey abbreviation   = CU    (Consumer Price Index)
    *   seasonal(code)        = U     (S = Seasonally Adjusted, U = Unadjusted)
    *   periodicity_code      = R     (R = Regular, S = Semi-Annual)
    *   area_code             = 0400  (0000 = U.S. city average)
    *   item_code             = AA0   (SA0 = All items, SAM = Medical care)

    import delimited "$SSDIMed/data/raw/bls/cpi/cu.data.2.Summaries", clear

    * Strip leading/trailing blanks from every string column
    ds, has(type string)
    local textcols `r(varlist)'
    foreach col of local textcols {
      replace `col' = strtrim(`col')
    }

    * QC: expected columns exist, and the focal series are present in this file.
    * The list must be defined here: looping over an undefined local runs zero
    * iterations, which silently skips the check. These are the two series ids
    * that the CPI analysis step of this script keeps.
    confirm variable series_id year period value footnote_codes
    local focal_series CUSR0000SA0 CUSR0000SAM
    foreach sid of local focal_series {
      qui count if series_id == "`sid'"
      assert r(N) > 1
    }

    * Add series names and information (lookup rows without data are allowed and dropped)
    merge m:1 series_id using "$SSDIMed/data/proc/bls/cpi/cu.series.dta", ///
      keepusing(area_code item_code seasonal periodicity_code series_title) ///
      assert(match using) keep(match) nogen noreport

    * Add period info (assert(match) requires a full two-way match)
    merge m:1 period using "$SSDIMed/data/proc/bls/cpi/cu.period.dta", assert(match) nogen noreport

    * Add area info (assert(match) requires a full two-way match)
    merge m:1 area_code using "$SSDIMed/data/proc/bls/cpi/cu.area.dta", ///
      keepusing(area_name) assert(match) nogen noreport

    * Label select variables
    label var year "Year of observation"
    label var period "Period of observation"
    label var area_code "Area of observation"
    label var area_name "Area name (of area_code)"
    label var period_abbr "Period name abbr (of period)"
    label var period_name "Period name (of period)"

    * Save: re-check the key (this also leaves the data sorted by series_id year period)
    bys series_id year period: assert _N == 1
    compress
    save "$SSDIMed/data/proc/bls/cpi/cu.data.dta", replace
  }
  
  * For analysis: national, monthly CPI (all items; medical care), rebased to a reference period
  if 1 {
    * CPI (U.S. city average, regional) (CU)
    * ref: https://download.bls.gov/pub/time.series/cu/cu.txt
    *
    * Output: one row per month (through 2019) with the two focal series as columns and
    * a calendar-year mean of each, all divided by their value in the reference month row.

    use "$SSDIMed/data/proc/bls/cpi/cu.data.dta", clear

    * Focal series: BLS id, expected title (used for QC), output variable name, variable label
    local series_1_id CUSR0000SA0
    local series_1_title All items in U.S. city average, all urban consumers, seasonally adjusted
    local series_1_var cpi_all_us
    local series_1_label All items in U.S. city avg, all urban consumers, seas. adj.

    local series_2_id CUSR0000SAM
    local series_2_title Medical care in U.S. city average, all urban consumers, seasonally adjusted
    local series_2_var cpi_med_us
    local series_2_label Medical care in U.S. city avg, all urban consumers, seas. adj.

    * QC: the ids still point at the series we think they do
    assert series_title == "`series_1_title'" if series_id == "`series_1_id'"
    assert series_title == "`series_2_title'" if series_id == "`series_2_id'"

    * Keep focal series over selected years
    keep if inlist(series_id, "`series_1_id'", "`series_2_id'")
    keep if year <= 2019

    * QC: coverage starts in 1947, periods are months 1-12, 12 rows per series-year, seasonally adjusted
    qui sum year
    assert r(min) == 1947
    assert substr(period, 1, 1) == "M"
    assert inrange(real(substr(period, 2, 3)), 1, 12)
    * bysort (not by) so this check does not depend on the sort order stored in the input file;
    * period is a secondary sort key so the resulting row order is deterministic
    bysort series_id year (period): assert _N == 12
    assert seasonal == "S"

    * Date: month of year, parsed from the digits after the leading letter of period
    gen m = real(substr(period, 2, 3))
    assert inrange(m, 1, 12)
    gen yearmonth = ym(year, m)
    format yearmonth %tm
    label var yearmonth "year-month of observation"
    assert month(dofm(yearmonth)) == m
    drop m
    order series_id year period yearmonth

    * Destring values
    destring value, replace

    * Reshape wide: one column per series_id. The descriptive variables in ivarlist are
    * checked to be constant within month so they can ride along in i().
    local keyvar series_id
    local ivars yearmonth
    gisid `keyvar' `ivars'
    local ivarlist year period area_* period_*
    foreach var of varlist `ivarlist' {
      bys `ivars': assert `var' == `var'[1]
    }
    greshape spread value, key(`keyvar') i(`ivars' `ivarlist') xi(drop)

    * Rename/label variables
    rename `series_1_id' `series_1_var'
    rename `series_2_id' `series_2_var'
    label var `series_1_var' "`series_1_label'"
    label var `series_2_var' "`series_2_label'"

    * Calendar year average: average value over 12 months of calendar year containing current month
    foreach var in `series_1_var' `series_2_var' {
      gegen `var'_yyyy = mean(`var'), by(year)
      label var `var'_yyyy "(calendar yyyy mean) `var'"
    }

    * Set reference period (i.e., period for which cpi = 1)
    local ref_period = ym(2017, 12)
    local ref_period_txt = string(`ref_period', "%tm")
    di "`ref_period_txt'"

    * Rebase every CPI column by its own value in the reference month row.
    * NOTE(review): for the _yyyy columns that value is the calendar-year mean of the
    * reference year, so they equal 1 for the whole reference year, while the label
    * still reads ref: <reference month>. Confirm this is the intended normalization.
    gisid yearmonth
    foreach var of varlist `series_1_var'* `series_2_var'* {
      di "`var'"
      sum `var' if yearmonth == `ref_period'
      assert r(N) == 1
      replace `var' = `var'/r(mean)
      label var `var' "`: var label `var'' (ref: `ref_period_txt')"
    }

    * Save
    order area_code yearmonth area_n* year period* `series_1_var'* `series_2_var'*
    compress
    bys yearmonth: assert _N == 1
    save "$SSDIMed/data/proc/bls/cpi/bls_cpi_us_monthly.dta", replace
  }
}


 

** EOF
